/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: April 2024

Inputs:		survey_data_050623.csv

Outputs: 	clean_survey.dta

*******************************************************************************/

* Start from an empty session and load the raw survey export
clear all

insheet using "${data_raw}Survey/survey_data_050623.csv"

* Define sample. The row count is displayed before and after the exclusions
* so the log records how many responses survive.

	count

	* Exclude responses by distribution channel, one channel per pass so the
	* log reports the number of rows removed by each restriction:
	*   "preview"   - the authors' own test runs of the survey
	*   "anonymous" - responses not linked to an email
	foreach channel in preview anonymous {
		drop if distributionchannel == "`channel'"
	}

	* Exclude responses that Qualtrics codes as spam
	drop if status == "Spam"

	count

* Create indicator variables for the two cross-randomized arms.
* Each indicator is derived from whether a specific question variable holds
* the numeric missing value "." (presumably because the respondent was shown
* the other version of that question -- TODO confirm against the survey flow).

	* Randomization arm 1 (high P vs. low P):
	* 1 when comp11_1 is not ".", 0 otherwise
	gen highPotential = cond(comp11_1 != ., 1, 0)

	* Randomization arm 2 (high comp vs. low comp):
	* 1 when qual11_1 is ".", 0 otherwise
	gen highCompetition = cond(qual11_1 == ., 1, 0)

	* NOTE(review): a respondent with both comp11_1 and qual11_1 missing is
	* coded highPotential = 0 and highCompetition = 1. Confirm this is intended.

* Clean up demographic variables.
* Every indicator below is 1/0 for respondents who answered the question and
* missing (.) for respondents whose answer is blank.

	* Position (survey item d1)
	rename d1 position
	gen profTenured = (position == "Tenure-track professor, tenured") if position != ""
	gen profUntenured = (position == "Tenure-track professor, untenured") if position != ""
	gen profNonTT = (position == "Non tenure-track professor") if position != ""
	* Postdocs and graduate students are pooled into one trainee indicator
	gen trainee = inlist(position, "Postdoc", "Graduate student") if position != ""
	* Remaining academic and non-academic researchers are pooled into "other"
	gen other = inlist(position, "Other academic researcher", "Non-academic researcher") if position != ""

	* Gender (survey item d2). male is left missing both for blank answers
	* and for respondents who chose "Prefer not to specify".
	rename d2 gender
	gen male = (gender == "Male") if !inlist(gender, "", "Prefer not to specify")

	* Geography (survey item d3): one indicator per continent
	rename d3 continent
	gen africa = (continent == "Africa") if continent != ""
	gen asia = (continent == "Asia") if continent != ""
	gen australia = (continent == "Australia / Oceania") if continent != ""
	gen europe = (continent == "Europe") if continent != ""
	gen nAmerica = (continent == "North America") if continent != ""
	gen sAmerica = (continent == "South America") if continent != ""
	
* Clean up variables from survey experiment questions.
* Each outcome is stored in two source columns; the cleaned variable takes the
* first column and falls back to the second where the first is missing.

	* Competition measure: comp11_1, or comp21_1 where comp11_1 is "."
	gen competition = cond(comp11_1 != ., comp11_1, comp21_1)

	* Maturation measure: qual11_1, or qual21_1 where qual11_1 is "."
	gen maturation = cond(qual11_1 != ., qual11_1, qual21_1)

	* Various quality measures - translate these to numeric scores.
	* The k-th name in the list is built from qual12_k (falling back to
	* qual22_k when qual12_k is blank). Its companion <name>Score is
	* 0 for "No", .5 for "Maybe", 1 for "Yes", and missing for anything else.
	local qualityItems replicate additionalExp codeReview mathReview proofread litReview
	local itemNum = 0
	foreach item of local qualityItems {
		local ++itemNum

		* Text answer, combined across the two source columns
		gen `item' = qual12_`itemNum'
		replace `item' = qual22_`itemNum' if `item' == ""

		* Numeric score for the text answer
		gen `item'Score = .
		replace `item'Score = 0 if `item' == "No"
		replace `item'Score = .5 if `item' == "Maybe"
		replace `item'Score = 1 if `item' == "Yes"
	}


* Create a simple index that is the row-wise mean of the six quality scores.
* rowmean() averages over the non-missing scores only (so NA answers are
* excluded) and returns missing when all six scores are missing.
* rowmean() is the documented name of the egen function formerly called rmean().
egen qualityIndexScore = rowmean(replicateScore additionalExpScore 			///
	codeReviewScore mathReviewScore proofreadScore litReviewScore)



* Structural biologists linked to PDB are a subsample of our bigger structural
* biology group. Break these into two separate groups: the big group and the
* subsample (so some observations will be in both groups).

* Duplicate the PDB-linked respondents. dupindicator is 0 for original rows
* and 1 for the copies added by expand.
expand 2 if l0_l1 == "PDB_linked", generate(dupindicator)

* Structural biology - all: the non-PDB-linked researchers plus the original
* (dupindicator == 0) rows of the PDB-linked researchers
replace l0_l1 = "structural biology all" if (l0_l1 == "PDB_broad") | 		///
	(l0_l1 == "PDB_linked" & dupindicator == 0)

* Structural biology - PDB: the duplicated (dupindicator == 1) rows of the
* PDB-linked researchers
replace l0_l1 = "structural biology PDB" if l0_l1 == "PDB_linked" & 		///
	dupindicator == 1


* Create a numeric field variable with labels.
* field stays missing (.) for any row whose l0_l1 matches none of the strings
* below.

gen field = .
replace field = 0 if l0_l1 == "structural biology PDB"
replace field = 1 if l0_l1 == "structural biology all"
replace field = 2 if l0_l1 == "biology|cell biology"
replace field = 3 if l0_l1 == "biology|ecology"
replace field = 4 if l0_l1 == "biology|horticulture-agronomy"
replace field = 5 if l0_l1 == "biology|immunology"
replace field = 6 if l0_l1 == "chemistry|biochemistry"
replace field = 7 if l0_l1 == "chemistry|inorganic chemistry"
replace field = 8 if l0_l1 == "physics|condensed matter physics"
replace field = 9 if l0_l1 == "physics|optics"
replace field = 10 if l0_l1 == "psychology|social psychology"

* NOTE(review): code 0 (the PDB-linked subsample) has no label text, so it
* displays as the number 0. Add one here if a label is wanted.
label define field 1 "structural biology" 2 "cell biology" 3 "ecology" 		///
	4 "horticulture" 5 "immunology" 6 "biochemistry" 7 "inorganic chem" 	///
	8 "cond matter physics" 9 "optics" 10 "social psychology"

* Attach the value label to the variable. label define alone only creates the
* label; without this line field is saved with no labels applied.
label values field field

* Write the cleaned survey data, overwriting any existing file
save "${data_clean}clean_survey.dta", replace


